{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "from gensim.models import word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "INPUT_PATH = '../input/'\n",
    "CACHE_PATH = '../cache/'\n",
    "OUTPUT_PATH ='../output/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 读取数据\n",
    "df_train = pd.read_csv(CACHE_PATH + 'train_processed_all.csv')\n",
    "df_predict = pd.read_csv(CACHE_PATH + 'predict_processed_all.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "EMBEDDING_SIZE = 300\n",
    "EMBEDDING_FILE_NAME = CACHE_PATH  + 'chinese_vector_300.txt'\n",
    "CORPUS_FILE_NAME = CACHE_PATH  + 'corpus_all.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 获取语库\n",
    "corpus = np.concatenate([df_train['Discuss'],df_predict['Discuss']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 保存文本集\n",
    "with open(CORPUS_FILE_NAME,'w') as file_vec:\n",
    "    for line in corpus:\n",
    "        file_vec.writelines(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 训练word2vec\n",
    "sentences = word2vec.Text8Corpus(CORPUS_FILE_NAME)\n",
    "model = Word2Vec(sentences, min_count=20, size= EMBEDDING_SIZE, window=6)\n",
    "model.wv.save_word2vec_format(EMBEDDING_FILE_NAME, binary=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/fwj/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar_cosmul` (Method will be removed in 4.0.0, use self.wv.most_similar_cosmul() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('北京市', 0.7834532856941223),\n",
       " ('帝都', 0.7675825357437134),\n",
       " ('南京', 0.7481988668441772),\n",
       " ('首都', 0.7471380233764648),\n",
       " ('北京城', 0.7414585947990417),\n",
       " ('长春', 0.7413082718849182),\n",
       " ('龙头', 0.7310134172439575),\n",
       " ('京城', 0.7299156785011292),\n",
       " ('前门', 0.7274509072303772),\n",
       " ('故宫', 0.7273288369178772)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar_cosmul('北京')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/fwj/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "300"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(model['北'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
